sample_info_tbl = read_parquet("cptac_gbm_2021_sample_info.parquet")
dna_methyl = readRDS("cptac_gbm_2021_dna_methyl.rds")
rna = readRDS("cptac_gbm_2021_rnaseq.rds")
theme_set(theme_bw())

Quality control per sample

has_measurements = !is.na(assay(dna_methyl))
per_sample = colSums(has_measurements) / nrow(dna_methyl)
per_probe = rowSums(has_measurements) / ncol(dna_methyl)
per_sample |> head(3)
## C3L-00104 C3L-00365 C3L-00674 
## 0.4841965 0.9648767 0.9627425
per_probe |> head(3)
## cg21870274 cg09499020 cg16535257 
##  0.8762887  0.0000000  0.3195876
data_completeness_per_sample_tbl = (colSums(!is.na(assay(dna_methyl))) / nrow(dna_methyl)) |> 
    enframe(name = "sample", value = "completeness") |> 
    arrange(desc(completeness)) |> 
    mutate(sample = fct_reorder(sample, completeness))


ggplot(data_completeness_per_sample_tbl, aes(x = sample, y = completeness)) +
    geom_col() +
    scale_y_continuous(labels = scales::label_percent()) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
    labs(
        title = "DNA methylation data completeness per sample",
        x = NULL, 
        y = "% of probes have value"
    )

dna_methyl = dna_methyl[
    , 
    data_completeness_per_sample_tbl |> filter(completeness > 0.3) |> pull(sample)
]

Quality control per probe

data_completeness_tbl = (1 - rowSums(is.na(assay(dna_methyl))) / ncol(dna_methyl)) |>
    enframe(name = "probe_id", value = "completeness")

ggplot(data = data_completeness_tbl, 
       aes(x = completeness)) + 
    geom_histogram(bins = 50) +
    scale_x_continuous(labels = scales::label_percent()) +
    scale_y_continuous(labels = scales::label_comma()) +
    labs(title = "EPIC DNA methylation microarry data completeness", 
         x = "Data completeness (%)",
         y = "Number of probes")

See the genomic location of the missing probes. Count the precentage of bad probes per 1M bp window

genome_tiles = seqinfo(dna_methyl) |> 
    as("GRanges") |> 
    makeWindows(w = 1e6, short.keep = TRUE)
names(genome_tiles) = NULL
mcols(genome_tiles) = NULL
genome_tiles
## GRanges object with 3103 ranges and 0 metadata columns:
##          seqnames            ranges strand
##             <Rle>         <IRanges>  <Rle>
##      [1]     chr1         1-1000000      *
##      [2]     chr1   1000001-2000000      *
##      [3]     chr1   2000001-3000000      *
##      [4]     chr1   3000001-4000000      *
##      [5]     chr1   4000001-5000000      *
##      ...      ...               ...    ...
##   [3099]     chrY 54000001-55000000      *
##   [3100]     chrY 55000001-56000000      *
##   [3101]     chrY 56000001-57000000      *
##   [3102]     chrY 57000001-57227415      *
##   [3103]     chrM           1-16569      *
##   -------
##   seqinfo: 25 sequences from an unspecified genome; no seqlengths
lowqual_probes_gr = dna_methyl[data_completeness_tbl$completeness < 0.1] |> 
    rowRanges() 

lowqual_probes_gr |> 
    seqnames() |> 
    table() / seqlengths(dna_methyl)
## 
##         chr1         chr2         chr3         chr4         chr5         chr6 
## 7.334617e-06 7.799548e-06 7.463606e-06 7.507312e-06 7.816534e-06 1.095980e-05 
##         chr7         chr8         chr9        chr10        chr11        chr12 
## 5.127208e-06 8.619345e-06 4.313748e-06 1.335601e-05 7.965260e-06 9.889304e-06 
##        chr13        chr14        chr15        chr16        chr17        chr18 
## 5.832238e-06 7.921997e-06 6.343685e-06 4.859509e-06 7.819121e-06 5.014104e-06 
##        chr19        chr20        chr21        chr22         chrX         chrY 
## 4.981438e-06 3.631050e-06 5.245131e-06 3.109106e-06 4.626992e-06 4.193794e-07 
##         chrM 
## 0.000000e+00
genome_tiles$num_lowqual = countOverlaps(genome_tiles, lowqual_probes_gr)
genome_tiles$total = countOverlaps(genome_tiles, rowRanges(dna_methyl))
genome_tiles$lowqual_probe_freq = genome_tiles$num_lowqual / genome_tiles$total
genome_tiles
## GRanges object with 3103 ranges and 3 metadata columns:
##          seqnames            ranges strand | num_lowqual     total
##             <Rle>         <IRanges>  <Rle> |   <integer> <integer>
##      [1]     chr1         1-1000000      * |           1       210
##      [2]     chr1   1000001-2000000      * |           6      1481
##      [3]     chr1   2000001-3000000      * |           7      1357
##      [4]     chr1   3000001-4000000      * |           9      1364
##      [5]     chr1   4000001-5000000      * |           2       259
##      ...      ...               ...    ... .         ...       ...
##   [3099]     chrY 54000001-55000000      * |           0         0
##   [3100]     chrY 55000001-56000000      * |           0         0
##   [3101]     chrY 56000001-57000000      * |           0         1
##   [3102]     chrY 57000001-57227415      * |           0         0
##   [3103]     chrM           1-16569      * |           0         0
##          lowqual_probe_freq
##                   <numeric>
##      [1]         0.00476190
##      [2]         0.00405132
##      [3]         0.00515844
##      [4]         0.00659824
##      [5]         0.00772201
##      ...                ...
##   [3099]                NaN
##   [3100]                NaN
##   [3101]                  0
##   [3102]                NaN
##   [3103]                NaN
##   -------
##   seqinfo: 25 sequences from an unspecified genome; no seqlengths
col_fun = circlize::colorRamp2(
    c(0, 0.5),
    # range(genome_tiles$lowqual_probe_freq, na.rm = TRUE),
    hcl_palette = "viridis"
)
cm = ColorMapping(col_fun = col_fun)
lgd = color_mapping_legend(cm, plot = FALSE, title = "Low\nqual")

gtrellis_layout(
    n_track = 1, 
    ncol = 1, 
    track_axis = FALSE, 
    xpadding = c(0.15, 0),
    gap = unit(1, "mm"), 
    border = FALSE,
    asist_ticks = FALSE, 
    add_ideogram_track = TRUE, 
    ideogram_track_height = unit(2, "mm"),
    legend = lgd
)

add_track(genome_tiles, panel_fun = function(gr) {
    grid.rect(
        x = (start(gr) + end(gr))/2,
        y = unit(0.5, "npc"),
        width = width(gr),
        height = unit(0.8, "npc"),
        default.units = "native",
        gp = gpar(fill = col_fun(gr$lowqual_probe_freq), col = FALSE)
    )
})
add_track(track = 2, clip = FALSE, panel_fun = function(gr) {
    chr = get_cell_meta_data("name")
    if(chr == "chrY") {
        grid.lines(get_cell_meta_data("xlim"), unit(c(0, 0), "npc"), 
            default.units = "native")
    }
    grid.text(chr, x = 0, y = 0, just = c("left", "bottom"), gp = gpar(fontsize=12))
})

Genome wide DNA methylation landscape

Select probes with sufficient data

sufficient_data_probe_ids = data_completeness_tbl |> 
    filter(completeness >= 0.8) |> 
    pull(probe_id)
dna_methyl = dna_methyl[sufficient_data_probe_ids] |> 
    sort()
hist(rowSds(assay(dna_methyl), na.rm = TRUE))

dna_methyl_top_var = dna_methyl[order(rowSds(assay(dna_methyl)), decreasing = TRUE)[1:8000], ] |> 
    sort(ignore.strand=TRUE)
smaller_font = gpar(fontsize = 10)
ht_opt(
    fast_hclust = TRUE,

    heatmap_row_names_gp = gpar(fontsize = 6),
    heatmap_column_names_gp = smaller_font,
    heatmap_row_title_gp = smaller_font,
    heatmap_column_title_gp = smaller_font,
    
    simple_anno_size = unit(4, "mm")
)
beta_val_col_fun = circlize::colorRamp2(c(0, 1), hcl_palette = "viridis")

ht = Heatmap(
    t(assay(dna_methyl_top_var)),
    name = "beta_val",
    col = beta_val_col_fun,
    
    cluster_columns = FALSE,
    column_split = seqnames(dna_methyl_top_var),
    show_column_names = FALSE,
    
    use_raster = TRUE,
    raster_quality = 4,
    heatmap_legend_param = list(
        title = "Beta value",
        legend_direction = "horizontal",
        legend_width = unit(3, "cm")
    ),
)

draw(
    ht,
    background = "transparent",
    merge_legends = TRUE,
    heatmap_legend_side = "bottom"
)

cairo_pdf("figures/genome_wide_dna_methyl_heatmap.pdf", width = 14, height = 8)
draw(
    ht,
    background = "transparent",
    merge_legends = TRUE,
    heatmap_legend_side = "bottom",
)
dev.off()
## quartz_off_screen 
##                 2

X-inactivation

dna_methyl_chrX = dna_methyl_top_var |> 
    subset(seqnames == "chrX") 

XIST DDX3Y RPS4Y1

sex_related_genes_tbl = tibble(
    gene_name = c("XIST", "DDX3Y", "RPS4Y1"),
)
rna_sex_related = rna |>
    subset(gene_name %in% sex_related_genes_tbl$gene_name)

rownames(rna_sex_related) = rowRanges(rna_sex_related)$gene_name
rowRanges(rna_sex_related)
## GRanges object with 3 ranges and 2 metadata columns:
##          seqnames            ranges strand |   gene_name   gene_biotype
##             <Rle>         <IRanges>  <Rle> | <character>    <character>
##     XIST     chrX 73820649-73852723      - |        XIST         lncRNA
##   RPS4Y1     chrY   2841602-2932000      + |      RPS4Y1 protein_coding
##    DDX3Y     chrY 12904108-12920478      + |       DDX3Y protein_coding
##   -------
##   seqinfo: 25 sequences (1 circular) from hg38 genome
plot_tbl = assay(rna_sex_related, "tpm_unstranded") |> 
    t() |> 
    as_tibble(rownames = "sample_nickname") |> 
    pivot_longer(
        cols = -sample_nickname, 
        names_to = "gene_name", 
        values_to = "tpm"
    ) |> 
    mutate(
        gene_exp = log2(tpm + 1)
    ) |> 
    left_join(
        sample_info_tbl |> 
            select(
                sample_nickname = case_id,
                sex_from_clinical_data = gender
            ),
        by = "sample_nickname"
    )

ggplot(plot_tbl, aes(x = gene_name, y = gene_exp, color = sex_from_clinical_data)) +
    geom_quasirandom(
        groupOnX = TRUE,
        dodge.width = 0.8
    ) +
    guides(
        color = guide_legend(title = "Sex")
    ) +
    labs(
        title = "Expression of sex related genes",
        x = NULL,
        y = "Gene expression (log2 TPM)"
    )

ggsave("figures/demo_sex_related_gene_expression.png", width=6, height=5, dpi=300)
shared_samples = intersect(colnames(dna_methyl_chrX), colnames(rna_sex_related))

anno_df = sample_info_tbl |> 
    filter(sample_nickname %in% shared_samples) |> 
    column_to_rownames("sample_nickname")
anno_df = anno_df[shared_samples, ]

dna_methyl_chrX = dna_methyl_chrX[, rownames(anno_df)]
rna_sex_related = rna_sex_related[, rownames(anno_df)]
anno_df |> head()
ha = rowAnnotation(
    sex = anno_df$gender,
    
    col = list(
        sex = c(Female = "#8700F9", Male = "#00C4AA")
    ),
    
    annotation_legend_param = list(
        sex = list(
            title = "Sex"
        )
    )
)

ht_dna_methyl = Heatmap(
    t(assay(dna_methyl_chrX)),
    name = "dna_methyl_beta",
    col = beta_val_col_fun,
    
    show_column_names = FALSE,
    
    heatmap_legend_param = list(
        title = "DNA methyl.\nbeta value",
        legend_direction = "horizontal",
        legend_width = unit(2, "cm")
    ),
    
    use_raster = TRUE,
    raster_quality = 2
)

ht_rna = Heatmap(
    log2(t(assay(rna_sex_related)) + 1),
    name = "rna",
    col = circlize::colorRamp2(c(0 , 7), hcl_palette = "inferno"),
    
    cluster_columns = FALSE,
    heatmap_legend_param = list(
        title = "RNA expr.\n(log2 TPM)",
        legend_direction = "horizontal",
        legend_width = unit(2, "cm")
    ),
    width = unit(4, "mm") * 3,
    
    right_annotation = ha
)


ht_list = ht_dna_methyl + ht_rna
draw(
    ht_list, 
    merge_legends = TRUE,
    heatmap_legend_side = "right",
    ht_gap = unit(1, "mm")
)

cairo_pdf("figures/sex_heatmap.pdf", width = 10, height = 7)
draw(
    ht_list, 
    merge_legends = TRUE,
    heatmap_legend_side = "right",
    ht_gap = unit(1, "mm")
)
dev.off()
## quartz_off_screen 
##                 2
# svg("figures/sex_heatmap.svg", width = 12, height = 7)
# draw(
#     ht_list, 
#     merge_legends = TRUE,
#     heatmap_legend_side = "right",
#     ht_gap = unit(1, "mm")
# )
# dev.off()

Figure 1 like overview

top_variable_probe_ids = dna_methyl |> 
    # Exclude probes at sex chromosomes
    subset(!seqnames %in% c("chrX", "chrY", "chrM"), ) |> 
    assay() |> 
    rowSds() |> 
    order(decreasing = TRUE) |> 
    head(8000)

dna_methyl_top_var = dna_methyl[top_variable_probe_ids, shared_samples] |> 
    sort(ignore.strand = TRUE)

dna_methyl_top_var
## class: RangedSummarizedExperiment 
## dim: 8000 93 
## metadata(6): title doi ... data_workflow annotation_source
## assays(1): beta_val
## rownames(8000): cg23552821 cg27541454 ... cg13213810 cg04865531
## rowData names(6): probe_strand gene_names ... CGI CGI_position
## colnames(93): C3L-03390 C3L-03387 ... C3L-00104 C3N-02784
## colData names(8): sample_type case_id ... gdc_file_id md5sum
top_variable_genes = rna |> 
    # Exclude probes at sex chromosomes
    subset(!seqnames %in% c("chrX", "chrY", "chrM"), ) |> 
    assay() |> 
    rowSds() |> 
    order(decreasing = TRUE) |> 
    head(2000)

rna_top_var = rna[top_variable_genes, shared_samples] |> 
    sort(ignore.strand = TRUE)

# Use gene symbol
rownames(rna_top_var) = rowData(rna_top_var)$gene_name
rna_top_var
## class: RangedSummarizedExperiment 
## dim: 2000 93 
## metadata(6): title doi ... data_workflow annotation_source
## assays(3): tpm_unstranded stranded_second unstranded
## rownames(2000): MTATP6P1 MTCO3P12 ... PLXNB2 MAPK8IP2
## rowData names(2): gene_name gene_biotype
## colnames(93): C3L-03390 C3L-03387 ... C3L-00104 C3N-02784
## colData names(8): sample_type case_id ... gdc_file_id md5sum

Pre calculate the normalized expression matrix

assay(rna_top_var, "norm_expr") =
    log2(assay(rna_top_var) + 1) |> 
    t() |> 
    scale(center = TRUE, scale = TRUE) |> 
    t()

Define colors

my_colors = list(
    sex = c(Female = "#8700F9", Male = "#00C4AA"),
    rna = c(
        "IDH mutant" = "#2A9C6A", "Proneural" = "#EC6F95", 
        "Mesenchymal" = "#E8AB16", "Classical"= "#23BFE8"
    ),
    dna_methyl = c(
        "dm1" = "#FF80A0",
        "dm2" = "#D8A400",
        "dm3" = "#63C02A",
        "dm4" = "#00CBB6",
        "dm5" = "#16B5FF",
        "dm6" = "#E984FB",
        "low_qual" = "gray20"
    )
)
my_colors$multiomic = c(
    "IDH mutant" = my_colors$rna[["IDH mutant"]],
    "nmf1" = my_colors$rna[['Proneural']],
    "nmf2" = my_colors$rna[['Mesenchymal']],
    "nmf3" = my_colors$rna[['Classical']]
)
wang_cancer_cell_2017_markers = tibble(
    "Mesenchymal" = c("BCL3", "TGFBI", "ITGB1", "LOX", "COL1A2", "VDR", "IL6", "MMP7"),
    "Proneural" = c("GARBR3", "HOXD3", "ERBB3", "SOX10", "CDKN1C", "PDGFRA", "HDAC2", "EPHB1"),
    "Classical" = c("PTPRA", "ELOVL2", "SOX9", "PAX6", "CDH4", "SEPTIN11", "MEOX2", "FGFR3")
) |> 
    pivot_longer(cols = everything(), names_to = "subtype", values_to = "gene_name") |> 
    arrange(subtype)
wang_cancer_cell_2017_markers |> head()
gene_markers_tbl = wang_cancer_cell_2017_markers |> 
    inner_join(
        tibble(
            rna_row_idx = which(rownames(rna_top_var) %in% wang_cancer_cell_2017_markers$gene_name),
            gene_name = rowData(rna_top_var[rna_row_idx])$gene_name
        ),
        by = "gene_name"
    ) |> 
    arrange(rna_row_idx) |> 
    mutate(color = my_colors$rna[subtype])

gene_markers_tbl 
ha = rowAnnotation(
    sex = anno_df$gender,
    multiomic = anno_df$multiomic,
    rna = anno_df$rna_wang_cancer_cell_2017,
    dna_methyl = anno_df$dna_methyl,
    
    col = my_colors,
    annotation_name_side = "top",
    annotation_legend_param = list(
        sex = list(
            title = "Sex"
        ),
        multiomic = list(
            title = "Multiomic subtypes"
        ),
        rna = list(
            title = "RNA only subtypes"
        ),
        dna_methyl = list(
            title = "DNA methyl.\nonly subtypes"
        )
    )
)

marker_ha = columnAnnotation(
    known_marker = anno_mark(
        at = gene_markers_tbl$rna_row_idx,
        labels = gene_markers_tbl$gene_name,
        labels_gp = gpar(col = gene_markers_tbl$color),
        labels_rot = 45,
        padding = unit(3, "mm")
    )
)

ht_dna_methyl = Heatmap(
    dna_methyl_top_var |> assay() |> t(),
    name = "dna_methyl_beta",
    col = beta_val_col_fun,
    
    show_column_names = FALSE,
    cluster_columns = TRUE,
    clustering_distance_columns = "pearson",
    clustering_method_columns = "ward.D2",
    column_dend_reorder = TRUE,
    show_column_dend = FALSE,
    
    cluster_rows = TRUE,
    row_dend_reorder = TRUE,
    clustering_distance_rows = "pearson",
    clustering_method_rows = "ward.D2",
    show_row_dend = TRUE,
    
    heatmap_legend_param = list(
        title = "DNA methyl.\nbeta value",
        legend_direction = "horizontal",
        legend_width = unit(2, "cm")
    ),
    
    use_raster = TRUE,
    raster_quality = 2,
    
    left_annotation = ha,

    width = unit(7, "in")
)

ht_rna = Heatmap(
    rna_top_var |> assay("norm_expr") |> t(),
    col = circlize::colorRamp2(c(-2, 0, 2), colors = c("blue", "white", "red")),
    
    cluster_rows = FALSE,
    # clustering_distance_rows = "pearson",
    # clustering_method_rows = "ward.D2",
    row_dend_reorder = FALSE,
    
    show_column_names = FALSE,
    cluster_columns = TRUE,
    clustering_distance_columns = "pearson",
    clustering_method_columns = "ward.D2",
    show_column_dend = FALSE,
    column_dend_reorder = TRUE,
    
    heatmap_legend_param = list(
        title = "RNA expr.",
        legend_direction = "horizontal",
        legend_width = unit(2, "cm")
    ),
    
    top_annotation = marker_ha,
    
    use_raster = TRUE,
    raster_quality = 2,
    
    width = unit(4, "in")
)

ht_list = ht_dna_methyl + ht_rna

draw(
    ht_list,
    merge_legends = TRUE,
    heatmap_legend_side = "right",
    ht_gap = unit(1, "mm")
)

cairo_pdf("figures/data_overview_heatmap.pdf", width = 15, height = 9)
draw(
    ht_list, 
    merge_legends = TRUE,
    heatmap_legend_side = "right",
    ht_gap = unit(1, "mm")
)
dev.off()
## quartz_off_screen 
##                 2

Re-use color palettes

my_colors_tbl = my_colors |> 
    imap_dfr(function(x, name) {
        x |> 
            enframe('label', 'color') |> 
            mutate(column = .env$name)
    }) |> 
    select(column, label, color)

my_colors_tbl
my_colors.recovered = my_colors_tbl |> 
    split(my_colors_tbl$column) |> 
    map(function(data_tbl) {
        data_tbl |> 
            select(label, color) |> 
            deframe()
    })

my_colors.recovered
## $dna_methyl
##       dm1       dm2       dm3       dm4       dm5       dm6  low_qual 
## "#FF80A0" "#D8A400" "#63C02A" "#00CBB6" "#16B5FF" "#E984FB"  "gray20" 
## 
## $multiomic
## IDH mutant       nmf1       nmf2       nmf3 
##  "#2A9C6A"  "#EC6F95"  "#E8AB16"  "#23BFE8" 
## 
## $rna
##  IDH mutant   Proneural Mesenchymal   Classical 
##   "#2A9C6A"   "#EC6F95"   "#E8AB16"   "#23BFE8" 
## 
## $sex
##    Female      Male 
## "#8700F9" "#00C4AA"